package com.kdtech.analyse.NewsPaper;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import com.kdtech.crawler.CrawlHTML;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.RegexUtils;
import com.kdtech.utils.StringUtils;

/**
 * URL classifier and article parser for the digital editions ("szb") of a group
 * of newspapers, most of them hosted under {@code chinajilin.com.cn}.
 *
 * <p>Papers listed by the original author. The trailing number is kept from the
 * original header; it is presumably an internal source id (TODO confirm):
 * <ul>
 *   <li>Changjiang Daily - 499</li>
 *   <li>http://bcrbszb.chinajilin.com.cn/ Baicheng Daily - 691</li>
 *   <li>http://cbsrbszb.chinajilin.com.cn/ Jilin, Changbaishan Daily - 684</li>
 *   <li>http://lyrbszb.chinajilin.com.cn/ Jilin, Liaoyuan Daily - 685</li>
 *   <li>http://jcrbszb.chinajilin.com.cn/ Jilin, Jiangcheng Daily - 686</li>
 *   <li>http://sprbszb.chinajilin.com.cn/ Jilin, Siping Daily - 687</li>
 *   <li>http://syrbszb.chinajilin.com.cn/ Jilin, Songyuan Daily - 688</li>
 *   <li>http://thrbszb.chinajilin.com.cn/ Jilin, Tonghua Daily - 689</li>
 *   <li>http://ybrbszb.chinajilin.com.cn/ Jilin, Yanbian Daily - 690</li>
 * </ul>
 *
 * <p>The URL patterns below additionally cover the hosts
 * {@code cswbszb.chinajilin.com.cn}, {@code jlrbszb.chinajilin.com.cn},
 * {@code www.hybrb.com}, {@code szb.hybrb.com} and {@code szb.cbsrb.com}.
 *
 * @author KK
 */
public class ChinajilinNewsPaperAnalyse implements AnalyseNews {

	/** Regular expressions describing article (detail) page URLs. */
	private static final String[] DETAIL_PAGE_PATTERNS = {
			"http://cswbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*.htm(\\?div=-1)?",
			"http://jlrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*.htm(\\?div=-1)?",
			"http://bcrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*.htm(\\?div=-1)?",
			"http://cbsrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*.htm(\\?div=-1)?",
			"http://lyrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*.htm(\\?div=-1)?",
			"http://jcrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*.htm(\\?div=-1)?",
			"http://sprbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*.htm(\\?div=-1)?",
			"http://syrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*.htm(\\?div=-1)?",
			"http://thrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*.htm(\\?div=-1)?",
			"http://www.hybrb.com/dzb/home/news.asp\\?pid=[0-9]*&gid=[0-9]*&nid=[0-9]*",
			"http://ybrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/content_[0-9]*.htm(\\?div=-1)?",
			"http://szb.cbsrb.com/shtml/cbsrb/[0-9]{8}/[0-9].*.shtml",
			"http://szb.hybrb.com/dzb/home/news.asp.*?gid=[0-9].*&nid=[0-9].*&pid=[0-9].*"
	};

	/*
	 * Sample task (page/section index) URLs:
	 * http://bcrbszb.chinajilin.com.cn/html/2012-12/04/node_247.htm
	 * http://cbsrbszb.chinajilin.com.cn/html/2012-12/04/node_102.htm
	 * http://lyrbszb.chinajilin.com.cn/html/2012-12/04/node_206.htm
	 * http://jcrbszb.chinajilin.com.cn/html/2012-12/03/node_146.htm
	 * http://sprbszb.chinajilin.com.cn/html/2012-12/03/node_186.htm
	 * http://syrbszb.chinajilin.com.cn/html/2012-12/03/node_3.htm
	 * http://thrbszb.chinajilin.com.cn/html/2012-12/03/node_23.htm
	 * http://ybrbszb.chinajilin.com.cn/html/2011-07/13/node_226.htm
	 */
	/** Regular expressions describing task (index/listing) page URLs. */
	private static final String[] TASK_PAGE_PATTERNS = {
			"http://cswbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/node_[0-9]*.htm",
			"http://jlrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/node_[0-9]*.htm",
			"http://bcrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/node_[0-9]*.htm",
			"http://cbsrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/node_[0-9]*.htm",
			"http://lyrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/node_[0-9]*.htm",
			"http://jcrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/node_[0-9]*.htm",
			"http://sprbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/node_[0-9]*.htm",
			"http://syrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/node_[0-9]*.htm",
			"http://thrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/node_[0-9]*.htm",
			"http://ybrbszb.chinajilin.com.cn/html/[0-9]{4}-[0-9]{2}/[0-9]{2}/node_[0-9]*.htm",
			"http://www.hybrb.com/dzb/home/index.asp\\?pid=[0-9]*&gid=[0-9]*",
			"http://szb.cbsrb.com/shtml/cbsrb/[0-9]{8}/v.*.shtml",
			// Was "index.asp*gid=...": "p*" only repeats the letter 'p', so the '?'
			// that starts the query string could never be matched. Use ".*?" like
			// the corresponding detail-page pattern for the same host.
			"http://szb.hybrb.com/dzb/home/index.asp.*?gid=[0-9].*&pid=[0-9].*",
	};

	/** Selector tried first for the title; its text may carry a "::" suffix. */
	private static final String PRIMARY_TITLE_SELECTOR =
			"html body table tbody tr td table tbody tr td table tbody tr td strong:gt(1)";

	/** Title selectors tried, in order, when the primary one yields blank text. */
	private static final String[] FALLBACK_TITLE_SELECTORS = {
			"div.title",
			"html body table tbody tr td table.border-grey tbody tr td#DivDisplay.f-14 div.f-20 strong",
			"h1"
	};

	/** Content selectors tried, in order, until one yields non-blank content. */
	private static final String[] CONTENT_SELECTORS = {
			"div#ozoom",
			"div.content",
			"html body table tbody tr td table.border-grey tbody tr td#DivDisplay.f-14 div.f-14 p",
			"div#content_div"
	};

	/**
	 * Tells whether the URL looks like an article (detail) page of one of the
	 * supported newspapers.
	 *
	 * @param url the URL to test
	 * @return the result of {@code RegexUtils.matchAny} against the detail-page patterns
	 */
	public boolean isDetailPage(String url) {
		return RegexUtils.matchAny(url, DETAIL_PAGE_PATTERNS);
	}

	/**
	 * Extracts title, content and date from a fetched article page.
	 *
	 * <p>Each field is read through a chain of selectors because the supported
	 * sites use different page layouts; the first selector that yields a
	 * non-blank value wins.
	 *
	 * @param urlMeta carrier of the page URL and its raw HTML
	 * @return a new {@code NewsMeta} holding the URL, the whitespace-trimmed
	 *         title and content, and the date (may be {@code null} when no date
	 *         could be matched)
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		String html = urlMeta.getHtml();
		String url = urlMeta.getUrl();
		Document doc = Jsoup.parse(html);

		NewsMeta newspaper = new NewsMeta();
		newspaper.setUrl(url);
		newspaper.setTitle(StringUtils.trimSpace(extractTitle(doc)));
		newspaper.setContent(StringUtils.trimSpace(extractContent(url, doc)));
		newspaper.setDate(extractDate(url, doc));
		return newspaper;
	}

	/** Returns the article title, falling back through the known page layouts. */
	private String extractTitle(Document doc) {
		// Elements.text() never returns null, so no null check is needed here.
		String title = StringUtils.substringBefore(doc.select(PRIMARY_TITLE_SELECTOR).text(), "::");
		for (String selector : FALLBACK_TITLE_SELECTORS) {
			if (!StringUtils.isBlank(title)) {
				break;
			}
			title = doc.select(selector).text();
		}
		return title;
	}

	/** Returns the date matched in the URL, or else in the text of {@code div.content}. */
	private Long extractDate(String url, Document doc) {
		Long date = DateUtils.matchDate(url);
		if (date == null) {
			date = DateUtils.matchDate(doc.select("div.content").text());
		}
		return date;
	}

	/** Returns the cleaned article body from the first selector that yields non-blank content. */
	private String extractContent(String url, Document doc) {
		String content = null;
		for (String selector : CONTENT_SELECTORS) {
			content = HtmlCleaner.getContentHtml(url, doc.select(selector));
			if (!StringUtils.isBlank(content)) {
				break;
			}
		}
		return content;
	}

	/**
	 * Tells whether the URL looks like a task (page/section index) page of one
	 * of the supported newspapers.
	 *
	 * @param url the URL to test
	 * @return the result of {@code RegexUtils.matchAny} against the task-page patterns
	 */
	public boolean isTaskPage(String url) {
		return RegexUtils.matchAny(url, TASK_PAGE_PATTERNS);
	}

}
